* Model count data from GSS and HILDA
* ROC curves http://gim.unmc.edu/dxtests/roc3.htm

* ---------------------------------------------------------------------------
* Session setup: directory macros, log file and paging.
*   working   - GSS 2014 working folder (GSS count data in, graphs and log out)
*   workhilda - folder holding the prepared HILDA file move_long
*   hildadir  - HILDA Release 16 Stata files
* ---------------------------------------------------------------------------
local working H:\Documents\HILDA Project\ARC Methodology\UQ research\Chapter 3 - Mobility\GSS 2014
local workhilda H:\Documents\HILDA Project\Data\HILDA Working\Release 16\residential mobility
local hildadir "H:\Documents\HILDA Project\Data\HILDA Release\Release 16\Stata 160c"

* Write the log into the working folder. The path is derived from `working'
* so that the log location cannot drift away from the other outputs.
capture log close
log using "`working'\GSSmodelcountdata.log", replace

* The pause commands further down only stop execution when pause is switched on;
* uncomment the next line to step through the models interactively.
*pause on
set more off

* Restrict the dataset to just the key variables (to avoid as many zero cells as possible).
* Get the HILDA dataset, create counts and append the GSS counts.
* First, for people who age into being eligible for interview, include them in the
* longitudinal sample with the appropriate weight.
use xwaveid wlrb_n wlrc_n wlrd_n wlre_n wlrf_n wlrg_n wlrh_n wlri_n wlrj_n wlrk_n wlrl_n wlrm_n using "`hildadir'\longitudinal_weights_p160c.dta", clear
* xwaveid is converted to numeric before being used as the merge key.
destring xwaveid, replace
merge 1:m xwaveid using "`workhilda'\move_long"
drop _merge
* Restrict to the sample equivalent to GSS (the age restriction is left until later).
* Stata orders missing values above every number, so "ageg>=4" on its own would
* also keep records whose age group is missing; exclude those explicitly.
* Weight-file records without a match in move_long have wave missing and are
* removed by the wave==14 condition.
keep if ageg>=4 & !missing(ageg) & wave==14

* Build a revised longitudinal responding-person weight, lnwtrpr2.
* It starts as a copy of lnwtrp and is then overwritten, one single year of age
* at a time, for hgage 27 down to 16 with the weights wlrb_n ... wlrm_n
* (letter b pairs with age 27, c with 26, ... m with 16). Age 15 takes hhwtrp.
tabulate hgage if lnwtrp>0
generate lnwtrpr2 = lnwtrp
local age 27
foreach wv in b c d e f g h i j k l m {
    replace lnwtrpr2 = wlr`wv'_n if hgage==`age'
    * step down to the next (younger) single year of age
    local --age
}
replace lnwtrpr2 = hhwtrp if hgage==15

tabulate hgage if lnwtrpr2>0

* Note - this revision to the weights is now done in the setup program, where the
* new weight is called lnwtrpr; the correlation checks the two versions against each other.
correlate lnwtrpr lnwtrpr2

* Effective sample size (ESS) for respondents with a positive weight:
*     ESS = (sum of weights)^2 / (sum of squared weights)
gen lnwtrprsq=lnwtrpr^2
sum lnwtrpr* if lnwtrpr>0

* Take the inputs straight from the data instead of re-typing rounded means and
* counts from the log, so the figures stay correct if the sample or weights change.
quietly summarize lnwtrpr if lnwtrpr>0
local sum_w = r(sum)
local n_w = r(N)
quietly summarize lnwtrprsq if lnwtrpr>0
local sum_w2 = r(sum)
display "Respondents with positive weight: " `n_w'
display "Weighted population (sum of weights): " `sum_w'
display "Effective sample size: " (`sum_w')^2/`sum_w2'
* Values recorded from the original run (and used later in this file):
* ESS=6490 (9174 respondents, 16,952,690 population)

* Check the population that will be excluded (aged 15-24, i.e. ageg 4 and 5)
sum lnwtrpr if lnwtrpr>0 & ageg>=4 & ageg<=5
* population aged 15-24 is 2083*1603.94=3,341,007 so population 25+ is 13,611,683

* Home ownership flag (26 missing as DK/Ref):
*   hstenr 1 or 3 -> own=1, hstenr 2 or 4 -> own=0, any other value stays missing.
generate own = cond(inlist(hstenr, 1, 3), 1, 0) if inlist(hstenr, 1, 2, 3, 4)

* Bachelor degree or above (unknown is assumed to be not bachelor):
*   edhigh1 in 1-3 -> bach=1, edhigh1 in 4-10 -> bach=0, any other value stays missing.
generate bach = cond(inrange(edhigh1, 1, 3), 1, 0) if inrange(edhigh1, 1, 3) | inrange(edhigh1, 4, 10)

* Create the count dataset with variables: agegrp own bach move5y count.
* Each person contributes 1, so the weighted sum per cell is the weighted population.
gen count=1
sum count if lnwtrpr>0
* count=9174

sum ageg own bach pmove5 lnwtrpr

* One row per ageg x own x bach x pmove5 cell, holding the sum of lnwtrpr in that cell.
collapse (sum) count if lnwtrpr>=0 [pweight=lnwtrpr], by(ageg own bach pmove5)
* Revise count to the equivalent sample size:
* 6490 = effective sample size, 16,952,690 = weighted population (values recorded earlier in this file).
replace count=count*6490/16952690
sum count
* display 166*39.09639 gives 6490
drop if own==. | bach==. | pmove5==.
* Restrict to the age range equivalent to GSS.
* Stata orders missing above every number, so "ageg>=6" alone would also keep a
* cell whose age group is missing; exclude it explicitly.
keep if ageg>=6 & !missing(ageg)
sum count
* display 104*49.18859 gives 5116 (reduced from 6490 due to age restriction and missing values)

* Use the GSS variable names so the two count files can be appended.
rename ageg agegrp
rename pmove5 move5y


* Tag each cell with its source survey (1=HILDA, 2=GSS; the value label also
* reserves 3 for ACLD). The cells in memory at this point are the HILDA ones.
gen ds = 1
label define d 1 "HILDA" 2 "GSS" 3 "ACLD"
label values ds d
label variable ds "Source dataset"

* Stack the GSS cells underneath; rows that arrive with ds==. are marked as GSS.
append using "`working'\GSScountdata_move5y3.dta"
replace ds = cond(ds == ., 2, ds)
tabulate ds
summarize count if ds == 2
* count=104*148.4=15,434.1 (in thousands)
* GSS count is higher than HILDA equivalent age as GSS includes:
* i) people who were born in Australia or first arrived in Australia before 2004, then went overseas after 2004, and were in Australia in 2014 (about 200,000).
* ii) people who were overseas in 2001 and had returned by 2014 (about 200,000).
* iii) people who were in institutions / very remote Australia in 2001 but who moved out of these areas by 2014 (number unknown, but lowish).
* iv) people who arrived second half 2001 to 2003 (about 300,000 permanent arrivals and some temporary arrivals who turned into permanent arrivals perhaps 150,000=450,000) and were in Australia in 2014
* HILDA includes:
* i) people who have moved into institutions by 2014
* ii) people who have moved into very remote Australia by 2014
* HILDA excludes:
* i) people who were living overseas in last 5 years (otherwise pmove5 could not be calculated) (about 100,000 per year returning from overseas, 82% away for 2 years so number away overseas 
* for a year or more could be 1,000,000)
* Both GSS and HILDA exclude NPD and very remote (in 2014 for GSS and in 2011 for HILDA). ERP excludes NPD and very remote of NT (about 20% of state).

* Bring the GSS counts onto an equivalent-sample-size scale (a further adjustment
* for the sample design would be desirable).
* The GSS sample is 12932 (one individual per responding household) but smaller
* states were oversampled, so a simple random sample of 10000 is assumed.
* The GSS documentation gives neither an effective sample size nor the split of the
* sample by state and by targeted low socio-economic areas.
* 15434.1 is the GSS count total (in thousands) before rescaling.
replace count = count*10000/15434.1 if ds==2
summarize count if ds==2
* count=104*96.154=10000

* Whole-number cell counts, needed because the models use frequency weights.
capture drop approxcount
generate approxcount = round(count)

* Observed proportion moving, per dataset x age group x ownership.
* The rounded cell count is split into a "moved" and a "did not move" column
* (zero in the column that does not apply) ...
generate approxcount_move   = cond(move5y==1, approxcount, 0)
generate approxcount_nomove = cond(move5y==0, approxcount, 0)
* ... each column is totalled within ds/agegrp/own ...
bysort ds agegrp own: egen summove   = total(approxcount_move)
bysort ds agegrp own: egen sumnomove = total(approxcount_nomove)
* ... and the observed proportion is movers over everyone in the group.
generate move5yobs = summove/(summove + sumnomove)

* Cubic spline model of moving on age group, separately by ownership - HILDA (ds==1).
* bspline creates the spline basis variables with stub "bs"; the models use bs1-bs5.
bspline, xvar(agegrp) power(3) knots(6 10 18) gen(bs)

* Assemble the term list own#c.bs1 ... own#c.bs5
local ownspline
forvalues k = 1/5 {
    local ownspline `ownspline' own#c.bs`k'
}
* NOTE(review): the xi: prefix looks redundant here (no i.var terms) - kept as in the original.
xi: logit move5y `ownspline' [fweight=approxcount] if ds==1
estimates store m3
* ROC curve and information criteria for the fit
lroc
estat ic
* only stops if pause has been switched on
pause

* Plot fitted against observed proportions for HILDA, by ownership.
* Runs inside preserve/restore, so the prediction variables made here are discarded afterwards.
preserve
keep if ds==1
predict move5yhat
predict stdp, stdp
* standard error on the probability scale: p*(1-p)*se(linear predictor)
generate se = move5yhat * (1-move5yhat) * stdp
* 95% bounds, with the lower bound floored at zero
generate lb = cond(move5yhat - 1.96*se < 0, 0, move5yhat - 1.96*se)
generate ub = move5yhat + 1.96*se
* The shaded confidence bands (rarea lb ub) are disabled, so only the four lines are drawn.
twoway (line move5yhat agegrp if own==0, sort) ///
       (line move5yhat agegrp if own==1, sort) ///
       (line move5yobs agegrp if own==0, sort) ///
       (line move5yobs agegrp if own==1, sort), ///
       xlabel(6 (3) 18) xtick(6 (3) 18) ylabel(0 (0.1) 1) ytick(0 (0.1) 1) ///
       legend(label(1 "model-not own") label(2 "model-own") label(3 "obs-not own") label(4 "obs-own")) ///
       ytitle("Proportion moving in the last 5 years") xtitle("Age")
graph save "`working'\HILDAmove5y3_own", replace
restore
pause

* Same spline-by-ownership model, fitted to the GSS cells (ds==2).
* Assemble the term list own#c.bs1 ... own#c.bs5
local ownspline
forvalues k = 1/5 {
    local ownspline `ownspline' own#c.bs`k'
}
xi: logit move5y `ownspline' [fweight=approxcount] if ds==2
* NOTE(review): storing under m3 again replaces the HILDA fit saved under the same name.
estimates store m3
* ROC curve and information criteria for the fit
lroc
estat ic
* only stops if pause has been switched on
pause

* Plot fitted against observed proportions for GSS, by ownership.
* Runs inside preserve/restore, so the prediction variables made here are discarded afterwards.
preserve
keep if ds==2
predict move5yhat
predict stdp, stdp
* standard error on the probability scale: p*(1-p)*se(linear predictor)
generate se = move5yhat * (1-move5yhat) * stdp
* 95% bounds, with the lower bound floored at zero
generate lb = cond(move5yhat - 1.96*se < 0, 0, move5yhat - 1.96*se)
generate ub = move5yhat + 1.96*se
* The shaded confidence bands (rarea lb ub) are disabled, so only the four lines are drawn.
twoway (line move5yhat agegrp if own==0, sort) ///
       (line move5yhat agegrp if own==1, sort) ///
       (line move5yobs agegrp if own==0, sort) ///
       (line move5yobs agegrp if own==1, sort), ///
       xlabel(6 (3) 18) xtick(6 (3) 18) ylabel(0 (0.1) 1) ytick(0 (0.1) 1) ///
       legend(label(1 "model-not own") label(2 "model-own") label(3 "obs-not own") label(4 "obs-own")) ///
       ytitle("Proportion moving in the last 5 years") xtitle("Age")
graph save "`working'\GSSmove5y3_own", replace
restore
pause

* Pooled HILDA + GSS models: effect of the bachelor flag after adjusting for the
* age spline by ownership and dataset.
* NOTE(review): the original heading said "adjusting for agegrp and sex", but no
* sex term appears in any of these models.

* Term lists shared by the models below:
*   dsown     = ds#own#c.bs1 ... ds#own#c.bs5
*   dsownbach = ds#own#bach#c.bs1 ... ds#own#bach#c.bs5
*   ownonly   = own#c.bs1 ... own#c.bs5
local dsown
local dsownbach
local ownonly
forvalues k = 1/5 {
    local dsown     `dsown' ds#own#c.bs`k'
    local dsownbach `dsownbach' ds#own#bach#c.bs`k'
    local ownonly   `ownonly' own#c.bs`k'
}

* (1) bachelor effect allowed to differ by dataset
xi: logit move5y `dsown' ds#bach [fweight=approxcount]
lroc
estat ic
*pause

* Disabled alternative: bachelor-specific splines by dataset (without ownership)
/*
xi:logit move5y ds#own#c.bs1 ds#own#c.bs2 ds#own#c.bs3 ds#own#c.bs4 ds#own#c.bs5 ds#bach#c.bs1 ds#bach#c.bs2 ds#bach#c.bs3 ds#bach#c.bs4 ds#bach#c.bs5 [fweight=approxcount]
lroc
estat ic
*/
* (2) bachelor-specific splines within dataset and ownership
xi: logit move5y `dsown' `dsownbach' [fweight=approxcount]
lroc
estat ic

* (3) model (1) fitted again
xi: logit move5y `dsown' ds#bach [fweight=approxcount]
lroc
estat ic

* (4) HILDA only, bachelor effect by ownership
xi: logit move5y `ownonly' own#bach [fweight=approxcount] if ds==1
* (5) pooled, bachelor effect by dataset and ownership; this is the last model
*     estimated, so the tests and predictions that follow refer to it.
xi: logit move5y `dsown' ds#own#bach [fweight=approxcount]
lroc
estat ic

* Wald tests on the most recently estimated model: for each own x bach combination,
* is the ds#own#bach coefficient for HILDA (ds=1) equal to the one for GSS (ds=2)?
* The result comments below record what was found in the original run.

* not own, bachelor
test [move5y]1b.ds#0b.own#1.bach=[move5y]2.ds#0b.own#1.bach
* No significant difference between datasets for not owning when have a bachelor degree

* not own, no bachelor
test [move5y]1b.ds#0b.own#0b.bach=[move5y]2.ds#0b.own#0b.bach
* Significant difference between datasets for not owning when don't have a bachelor degree

* own, no bachelor
test [move5y]1b.ds#1.own#0b.bach = [move5y]2.ds#1.own#0b.bach
* No significant difference between datasets for owning when don't have a bachelor degree

* own, bachelor
test [move5y]1b.ds#1.own#1.bach = [move5y]2.ds#1.own#1.bach
* Constraint dropped (no significant difference between datasets for owning when have a bachelor degree)

* Rebuild the rounded cell counts used as frequency weights.
capture drop approxcount
gen approxcount=round(count,1)

* Summary of observed data, now by dataset x age group x ownership x bachelor.
* The earlier versions of these helper variables are removed first.
drop approxcount_move approxcount_nomove summove sumnomove move5yobs
gen approxcount_move=0
gen approxcount_nomove=0
replace approxcount_move=approxcount if move5y==1
replace approxcount_nomove=approxcount if move5y==0
* ds is part of the by-list so that HILDA and GSS cells are not pooled into one
* observed proportion (the models and figures treat the two datasets separately).
bys ds agegrp own bach: egen summove=total(approxcount_move)
bys ds agegrp own bach: egen sumnomove=total(approxcount_nomove)
gen move5yobs=summove/(summove+sumnomove)

* Predictions from the most recently estimated model.
*drop move5yhat stdp se lb ub
predict move5yhat
predict stdp, stdp
* standard error on the probability scale: p*(1-p)*se(linear predictor)
gen se = move5yhat * (1-move5yhat) * stdp
gen lb = move5yhat - 1.96*se
gen ub = move5yhat + 1.96*se
* Keep both bounds inside [0,1]. Missing counts as larger than any number in
* Stata, so the upper cap must skip missing values explicitly.
replace lb = 0 if lb<0
replace ub = 1 if ub>1 & !missing(ub)
* HILDA figures: fitted proportion moving with lower/upper bounds for the four
* own x bach groups, once in colour (graph name hilda) and once in grey scale (hilda_gs).
* Plot order matters because the legend refers to plots by position:
*   1-2  fitted, bach==0 (not own, own)     3-6  bounds, bach==0
*   7-8  fitted, bach==1 (not own, own)     9-12 bounds, bach==1
preserve
keep if ds==1

* Axis, legend and title options common to both versions of the figure.
* Only every second age group is labelled on the x axis.
local shared xlabel(4 "15-19" 6 "25-29" 8 "35-39" 10 "45-49" 12 "55-59" 14 "65-69" 16 "75-79" 18 "85+") ///
    xtick(4 (2) 18) ylabel(0 (0.1) 1) ytick(0 (0.1) 1) ///
    legend(label(1 "Not own, not bach") label(2 "Own, not bach") label(3 "CI LB") label(4 "CI UB") label(5 "CI LB") label(6 "CI UB") ///
           label(7 "Not own, bach") label(8 "Own, bach") label(9 "CI LB") label(10 "CI UB") label(11 "CI LB") label(12 "CI UB") ///
           order(1 2 7 8 3 5 9 11 4 6 10 12) size(2) col(4)) ///
    graphregion(color(white)) ///
    ytitle("Proportion moving in the last 5 years") xtitle("Age") title("HILDA 2014")

* Colour version
twoway (line move5yhat agegrp if own==0 & bach==0, sort lcolor(midblue)) ///
       (line move5yhat agegrp if own==1 & bach==0, sort lcolor(cranberry)) ///
       (line lb agegrp if own==0 & bach==0, sort lcolor(midblue) lpattern(shortdash) lwidth(thin)) ///
       (line ub agegrp if own==0 & bach==0, sort lcolor(midblue) lpattern(shortdash) lwidth(thin)) ///
       (line lb agegrp if own==1 & bach==0, sort lcolor(cranberry) lpattern(shortdash) lwidth(thin)) ///
       (line ub agegrp if own==1 & bach==0, sort lcolor(cranberry) lpattern(shortdash) lwidth(thin)) ///
       (line move5yhat agegrp if own==0 & bach==1, sort lcolor(midgreen)) ///
       (line move5yhat agegrp if own==1 & bach==1, sort lcolor(gold)) ///
       (line lb agegrp if own==0 & bach==1, sort lcolor(midgreen) lpattern(shortdash) lwidth(thin)) ///
       (line ub agegrp if own==0 & bach==1, sort lcolor(midgreen) lpattern(shortdash) lwidth(thin)) ///
       (line lb agegrp if own==1 & bach==1, sort lcolor(gold) lpattern(shortdash) lwidth(thin)) ///
       (line ub agegrp if own==1 & bach==1, sort lcolor(gold) lpattern(shortdash) lwidth(thin)), ///
       `shared' name(hilda, replace)
graph save "`working'\HILDAmove5y3_mown", replace

* Grey-scale version: midblue=gs0, cranberry=gs4, midgreen=gs8, gold=gs12.
* Fitted lines for owners are dash-dot; all bounds are dotted.
twoway (line move5yhat agegrp if own==0 & bach==0, sort lcolor(gs0)) ///
       (line move5yhat agegrp if own==1 & bach==0, sort lcolor(gs4) lpattern(dash_dot)) ///
       (line lb agegrp if own==0 & bach==0, sort lcolor(gs0) lpattern(dot)) ///
       (line ub agegrp if own==0 & bach==0, sort lcolor(gs0) lpattern(dot)) ///
       (line lb agegrp if own==1 & bach==0, sort lcolor(gs4) lpattern(dot)) ///
       (line ub agegrp if own==1 & bach==0, sort lcolor(gs4) lpattern(dot)) ///
       (line move5yhat agegrp if own==0 & bach==1, sort lcolor(gs8)) ///
       (line move5yhat agegrp if own==1 & bach==1, sort lcolor(gs12) lpattern(dash_dot)) ///
       (line lb agegrp if own==0 & bach==1, sort lcolor(gs8) lpattern(dot)) ///
       (line ub agegrp if own==0 & bach==1, sort lcolor(gs8) lpattern(dot)) ///
       (line lb agegrp if own==1 & bach==1, sort lcolor(gs12) lpattern(dot)) ///
       (line ub agegrp if own==1 & bach==1, sort lcolor(gs12) lpattern(dot)), ///
       `shared' name(hilda_gs, replace)
graph save "`working'\HILDAmove5y3_mown_gs", replace
restore
* GSS figures: fitted proportion moving with lower/upper bounds for the four
* own x bach groups, once in colour (graph name gss) and once in grey scale (gss_gs).
* Plot order matters because the legend refers to plots by position:
*   1-2  fitted, bach==0 (not own, own)     3-6  bounds, bach==0
*   7-8  fitted, bach==1 (not own, own)     9-12 bounds, bach==1
preserve
keep if ds==2

* Axis, legend and title options common to both versions of the figure.
* Only every second age group is labelled on the x axis.
local shared xlabel(4 "15-19" 6 "25-29" 8 "35-39" 10 "45-49" 12 "55-59" 14 "65-69" 16 "75-79" 18 "85+") ///
    xtick(4 (2) 18) ylabel(0 (0.1) 1) ytick(0 (0.1) 1) ///
    legend(label(1 "Not own, not bach") label(2 "Own, not bach") label(3 "CI LB") label(4 "CI UB") label(5 "CI LB") label(6 "CI UB") ///
           label(7 "Not own, bach") label(8 "Own, bach") label(9 "CI LB") label(10 "CI UB") label(11 "CI LB") label(12 "CI UB") ///
           order(1 2 7 8 3 5 9 11 4 6 10 12) size(2) col(4)) ///
    graphregion(color(white)) ///
    ytitle("Proportion moving in the last 5 years") xtitle("Age") title("GSS 2014")

* Colour version
twoway (line move5yhat agegrp if own==0 & bach==0, sort lcolor(midblue)) ///
       (line move5yhat agegrp if own==1 & bach==0, sort lcolor(cranberry)) ///
       (line lb agegrp if own==0 & bach==0, sort lcolor(midblue) lpattern(shortdash) lwidth(thin)) ///
       (line ub agegrp if own==0 & bach==0, sort lcolor(midblue) lpattern(shortdash) lwidth(thin)) ///
       (line lb agegrp if own==1 & bach==0, sort lcolor(cranberry) lpattern(shortdash) lwidth(thin)) ///
       (line ub agegrp if own==1 & bach==0, sort lcolor(cranberry) lpattern(shortdash) lwidth(thin)) ///
       (line move5yhat agegrp if own==0 & bach==1, sort lcolor(midgreen)) ///
       (line move5yhat agegrp if own==1 & bach==1, sort lcolor(gold)) ///
       (line lb agegrp if own==0 & bach==1, sort lcolor(midgreen) lpattern(shortdash) lwidth(thin)) ///
       (line ub agegrp if own==0 & bach==1, sort lcolor(midgreen) lpattern(shortdash) lwidth(thin)) ///
       (line lb agegrp if own==1 & bach==1, sort lcolor(gold) lpattern(shortdash) lwidth(thin)) ///
       (line ub agegrp if own==1 & bach==1, sort lcolor(gold) lpattern(shortdash) lwidth(thin)), ///
       `shared' name(gss, replace)
graph save "`working'\GSSmove5y3_mown", replace

* Grey-scale version: fitted lines for owners are dash-dot; all bounds are dotted.
twoway (line move5yhat agegrp if own==0 & bach==0, sort lcolor(gs0)) ///
       (line move5yhat agegrp if own==1 & bach==0, sort lcolor(gs4) lpattern(dash_dot)) ///
       (line lb agegrp if own==0 & bach==0, sort lcolor(gs0) lpattern(dot)) ///
       (line ub agegrp if own==0 & bach==0, sort lcolor(gs0) lpattern(dot)) ///
       (line lb agegrp if own==1 & bach==0, sort lcolor(gs4) lpattern(dot)) ///
       (line ub agegrp if own==1 & bach==0, sort lcolor(gs4) lpattern(dot)) ///
       (line move5yhat agegrp if own==0 & bach==1, sort lcolor(gs8)) ///
       (line move5yhat agegrp if own==1 & bach==1, sort lcolor(gs12) lpattern(dash_dot)) ///
       (line lb agegrp if own==0 & bach==1, sort lcolor(gs8) lpattern(dot)) ///
       (line ub agegrp if own==0 & bach==1, sort lcolor(gs8) lpattern(dot)) ///
       (line lb agegrp if own==1 & bach==1, sort lcolor(gs12) lpattern(dot)) ///
       (line ub agegrp if own==1 & bach==1, sort lcolor(gs12) lpattern(dot)), ///
       `shared' name(gss_gs, replace)
graph save "`working'\GSSmove5y3_mown_gs", replace
restore
* Put the GSS and HILDA panels side by side with one shared legend, resize,
* and export as PNG and SVG - first the colour pair, then the grey-scale pair.
* (An overall title "Five year residential mobility" is deliberately not set.)
foreach variant in colour gs {
    * graph names and file names carry the suffix _gs for the grey-scale pair
    local sfx = cond("`variant'" == "gs", "_gs", "")
    grc1leg gss`sfx' hilda`sfx', name(gss_hilda`sfx', replace) graphregion(fcolor(white))
    graph display, xsize(20) ysize(10)
    foreach ext in png svg {
        graph export "`working'\HILDA_GSSmove5y_mown`sfx'.`ext'", replace
    }
}



log close
